Enter the directory of the maca folder on your drive and the name of the tissue you want to analyze.
tissue_of_interest = "Liver"
Load the requisite packages and some additional helper functions.
library(here)
here() starts at /Users/olgabot/code/tabula-muris
library(useful)
Loading required package: ggplot2
library(Seurat)
Loading required package: cowplot
Attaching package: 'cowplot'
The following object is masked from 'package:ggplot2':
ggsave
Loading required package: Matrix
Warning: namespace 'Biobase' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'lme4' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'MatrixModels' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'Biobase' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'lme4' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'MatrixModels' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
library(dplyr)
Warning: package 'dplyr' was built under R version 3.4.2
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(Matrix)
save_dir = here('00_data_ingest', 'tissue_robj')
droplet_data_dir = here('00_data_ingest', '01_droplet_raw_data')
# read the metadata to get the plates we want
droplet_metadata_filename = here('00_data_ingest', '01_droplet_raw_data', 'metadata_droplet.csv')
droplet_metadata <- read.csv(droplet_metadata_filename, sep=",", header = TRUE)
colnames(droplet_metadata)[1] <- "channel"
droplet_metadata
Subset the metadata on the tissue.
tissue_metadata = filter(droplet_metadata, tissue == tissue_of_interest)[,c('channel','tissue','subtissue','mouse.sex')]
tissue_metadata
Use only the metadata rows corresponding to Bladder plates. Make a plate barcode dataframe to “expand” the per-plate metadata to be per-cell.
# Load the gene names and set the metadata columns by opening the first file
subfolder = paste0(tissue_of_interest, '-', tissue_metadata$channel[1])
raw.data <- Read10X(data.dir = here('00_data_ingest', '01_droplet_raw_data', 'droplet', subfolder))
colnames(raw.data) <- lapply(colnames(raw.data), function(x) paste0(tissue_metadata$channel[1], '_', x))
meta.data = data.frame(row.names = colnames(raw.data))
meta.data['channel'] = tissue_metadata$channel[1]
if (length(tissue_metadata$channel) > 1){
# Some tissues, like Thymus and Heart had only one channel
for(i in 2:nrow(tissue_metadata)){
subfolder = paste0(tissue_of_interest, '-', tissue_metadata$channel[i])
new.data <- Read10X(data.dir = here('00_data_ingest', '01_droplet_raw_data', 'droplet', subfolder))
colnames(new.data) <- lapply(colnames(new.data), function(x) paste0(tissue_metadata$channel[i], '_', x))
new.metadata = data.frame(row.names = colnames(new.data))
new.metadata['channel'] = tissue_metadata$channel[i]
raw.data = cbind(raw.data, new.data)
meta.data = rbind(meta.data, new.metadata)
}
}
rnames = row.names(meta.data)
meta.data <- merge(meta.data, tissue_metadata, sort = F)
row.names(meta.data) <- rnames
dim(raw.data)
[1] 23433 1924
corner(raw.data)
[1] 0 0 0 0 2
head(meta.data)
Process the raw data and load it into the Seurat object.
# Find ERCC's, compute the percent ERCC, and drop them from the raw data.
erccs <- grep(pattern = "^ERCC-", x = rownames(x = raw.data), value = TRUE)
percent.ercc <- Matrix::colSums(raw.data[erccs, ])/Matrix::colSums(raw.data)
ercc.index <- grep(pattern = "^ERCC-", x = rownames(x = raw.data), value = FALSE)
raw.data <- raw.data[-ercc.index,]
# Create the Seurat object with all the data
tiss <- CreateSeuratObject(raw.data = raw.data, project = tissue_of_interest,
min.cells = 1, min.genes = 1)
tiss <- AddMetaData(object = tiss, meta.data)
tiss <- AddMetaData(object = tiss, percent.ercc, col.name = "percent.ercc")
# Change default name for sums of counts from nUMI to nReads
# colnames(tiss@meta.data)[colnames(tiss@meta.data) == 'nUMI'] <- 'nReads'
# Create metadata columns for annotations and subannotations
tiss@meta.data[,'annotation'] <- NA
tiss@meta.data[,'subannotation'] <- NA
Calculate percent ribosomal genes.
ribo.genes <- grep(pattern = "^Rp[sl][[:digit:]]", x = rownames(x = tiss@data), value = TRUE)
percent.ribo <- Matrix::colSums(tiss@raw.data[ribo.genes, ])/Matrix::colSums(tiss@raw.data)
tiss <- AddMetaData(object = tiss, metadata = percent.ribo, col.name = "percent.ribo")
A sanity check: genes per cell vs reads per cell.
GenePlot(object = tiss, gene1 = "nUMI", gene2 = "nGene", use.raw=T)
Filter out cells with few reads and few genes.
tiss <- FilterCells(object = tiss, subset.names = c("nGene", "nUMI"),
low.thresholds = c(500, 2500), high.thresholds = c(25000, 5000000))
Normalize the data, then regress out correlation with total reads
tiss <- NormalizeData(object = tiss)
tiss <- ScaleData(object = tiss, vars.to.regress = c("nUMI", "percent.ribo","Rn45s"))
[1] "Regressing out nUMI" "Regressing out percent.ribo"
[3] "Regressing out Rn45s"
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 14%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 29%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 46%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 54%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================== | 70%
|
|============================================== | 71%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 86%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 94%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
[1] "Scaling data matrix"
|
| | 0%
|
|=================================================================| 100%
tiss <- FindVariableGenes(object = tiss, do.plot = TRUE, x.high.cutoff = Inf, y.cutoff = 0.5)
Run Principal Component Analysis.
tiss <- RunPCA(object = tiss, do.print = FALSE)
tiss <- ProjectPCA(object = tiss, do.print = FALSE)
Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, supressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
PCElbowPlot(object = tiss)
Choose the number of principal components to use.
# Set number of principal components.
n.pcs = 10
The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity–more connections within the group than between groups. The resolution parameter determines the scale…higher resolution will give more clusters, lower resolution will give fewer.
For the top-level clustering, aim to under-cluster instead of over-cluster. It will be easy to subset groups and further analyze them below.
# Set resolution
res.used <- 0.5
tiss <- FindClusters(object = tiss, reduction.type = "pca", dims.use = 1:n.pcs,
resolution = res.used, print.output = 0, save.SNN = TRUE, force.recalc = TRUE)
PrintFindClustersParams(object = tiss)
Parameters used in latest FindClusters calculation run on: 2018-01-22 10:42:35
=============================================================================
Resolution: 0.5
-----------------------------------------------------------------------------
Modularity Function Algorithm n.start n.iter
1 1 100 10
-----------------------------------------------------------------------------
Reduction used k.param k.scale prune.SNN
pca 30 25 0.0667
-----------------------------------------------------------------------------
Dims used in calculation
=============================================================================
1 2 3 4 5 6 7 8 9 10
To visualize
# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
tiss <- RunTSNE(object = tiss, dims.use = 1:n.pcs, seed.use = 10, perplexity=30, dim.embed = 2)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = tiss, do.label = T, label.size = 7)
BuildClusterTree(tiss)
[1] "Finished averaging RNA for cluster 0"
[1] "Finished averaging RNA for cluster 1"
[1] "Finished averaging RNA for cluster 2"
[1] "Finished averaging RNA for cluster 3"
[1] "Finished averaging RNA for cluster 4"
[1] "Finished averaging RNA for cluster 5"
[1] "Finished averaging RNA for cluster 6"
An object of class seurat in project Liver
13947 genes across 1026 samples.
How big are the clusters?
table(tiss@ident)
0 1 2 3 4 5 6
258 214 203 166 115 50 20
Check expression of genes of interest.
Dotplots let you see the intensity of exppression and the fraction of cells expressing for each of your genes of interest.
Which markers identify a specific cluster?
clust.markers <- FindMarkers(object = tiss, ident.1 = 2, ident.2 = 1, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
print(x = head(x= clust.markers, n = 10))
p_val avg_diff pct.1 pct.2
Malat1 2.354075e-98 2.528658 0.970 0.346
Hspa5 3.963206e-91 1.699159 0.990 0.565
Sod1 2.005082e-85 1.331873 0.975 0.827
Pigr 4.462654e-85 1.442645 0.980 0.682
C3 1.325940e-84 1.465621 0.980 0.864
H2-K1 1.563113e-84 1.236976 1.000 0.860
Tat 6.489161e-84 1.401624 0.970 0.860
Fga 8.437467e-84 1.042293 0.980 0.916
Cyp2c69 4.134723e-81 1.304819 0.966 0.159
P4hb 4.300413e-80 1.452901 0.966 0.579
#write.csv(x = head(x = clust.markers, n = 10), "clus-markers.csv")
# find all markers distinguishing FEMALE vs MALE
cluster_sex.markers <- FindMarkers(object = tiss, ident.1 = c(0,2, 5), ident.2 = c(1, 3, 4),
min.pct = 0.25)
print(x = head(x = cluster_sex.markers, n = 20))
p_val avg_diff pct.1 pct.2
Mup20 2.593971e-265 -3.6273459 0.648 0.960
Cyp2c69 5.438410e-198 2.2088031 0.961 0.174
Xist 3.823751e-181 0.4710078 0.775 0.038
Cyp2d9 2.381740e-170 -2.1541759 0.190 0.887
A1bg 1.780142e-142 1.9498339 0.798 0.053
Elovl3 1.833643e-129 -1.7992863 0.057 0.709
Cyp2b9 2.576932e-129 1.0641712 0.736 0.046
Cyp2c40 9.271837e-129 1.3004062 0.912 0.234
Serpina1e 3.811086e-128 -1.4993187 0.857 0.980
Selenbp2 1.238058e-126 -1.5502394 0.431 0.901
Mup1 1.329531e-121 -1.6583617 0.358 0.851
Gstp1 2.474164e-121 -2.2275370 0.423 0.867
Nudt7 1.763011e-118 -1.0554235 0.714 0.913
Cyp4a12a 7.446231e-105 -1.2645781 0.004 0.558
Fmo3 1.678231e-98 1.2125441 0.597 0.024
Mat1a 6.909099e-94 0.7416253 0.977 0.887
Cyp2a4 5.173767e-85 1.0596223 0.566 0.034
Prlr 1.725661e-82 0.9687795 0.824 0.317
Mup12 5.374523e-79 -1.0973382 0.196 0.685
Mup21 1.048075e-78 -1.2436678 0.127 0.612
#write.csv(x = head(x = cluster_sex.markers, n = 20), "clus-marker-SEX.csv")
You can also compute all markers for all clusters at once. This may take some time.
tiss.markers <- FindAllMarkers(object = tiss, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
Display the top markers you computed above.
top_genes <- tiss.markers %>% group_by(cluster) %>% top_n(20, avg_diff)
print(top_genes)
# A tibble: 140 x 6
# Groups: cluster [7]
p_val avg_diff pct.1 pct.2 cluster gene
<dbl> <dbl> <dbl> <dbl> <fctr> <chr>
1 3.803368e-186 1.8059707 1.000 0.427 0 Cyp2c69
2 9.227436e-114 1.4647938 0.988 0.438 0 Cyp2c37
3 4.384940e-105 1.0349867 1.000 0.694 0 Cyp2c50
4 6.644287e-104 1.1569247 0.996 0.618 0 Cyp2e1
5 1.523527e-91 0.8888426 0.996 0.844 0 Ces1c
6 2.391581e-86 1.0364644 0.705 0.077 0 Cyp7a1
7 6.746948e-85 1.0565346 0.632 0.042 0 Sult3a1
8 2.622495e-75 1.1485160 0.938 0.404 0 Cyp4a14
9 9.279120e-74 1.0056902 0.969 0.501 0 Cyp4a10
10 1.792562e-73 0.9906635 0.961 0.497 0 Cyp2c54
# ... with 130 more rows
At a coarse level, we can use canonical markers to match the unbiased clustering to known cell types:
# stash current cluster IDs
tiss <- StashIdent(object = tiss, save.name = "cluster.ids")
# enumerate current cluster IDs and the labels for them
cluster.ids <- c(0, 1, 2, 3, 4, 5, 6)
annotation <- c("hepatocyte", "hepatocyte","hepatocyte","hepatocyte","hepatocyte","hepatocyte", "endothelial cell")
cell_ontology_id <- c("CL:0000182", "CL:0000182","CL:0000182","CL:0000182","CL:0000182","CL:0000182", "CL:0000115")
tiss@meta.data[,'annotation'] <- plyr::mapvalues(x = tiss@ident, from = cluster.ids, to = annotation)
tiss@meta.data[,'cell_ontology_id'] <- plyr::mapvalues(x = tiss@ident, from = cluster.ids, to = cell_ontology_id)
tiss@meta.data[tiss@cell.names,'annotation'] <- as.character(tiss@meta.data$annotation)
tiss@meta.data[tiss@cell.names,'cell_ontology_id'] <- as.character(tiss@meta.data$cell_ontology_id)
TSNEPlot(object = tiss, do.label = TRUE, pt.size = 0.5, group.by='annotation')
subtiss <- SubsetData(object = tiss, ident.use = c(0,1,2,3,4,5,6), do.center = F, do.scale = F, cells.use = )
subcluster.ids <- c(0, 1, 2, 3, 4, 5, 6)
subannotation <- c("pericentral_female", "midlobular_male", "periportal_female", "periportal_male", "pericentral_male", "midlobular_female", "endothelial_cells")
subtiss@meta.data[,'subannotation'] <- plyr::mapvalues(x = subtiss@ident, from = subcluster.ids, to = subannotation)
tiss@meta.data[subtiss@cell.names,'subannotation'] <- as.character(subtiss@meta.data$subannotation)
TSNEPlot(object = subtiss, do.label = TRUE, pt.size = 0.5, group.by='subannotation')
Color by metadata, like plate barcode, to check for batch effects.
TSNEPlot(object = tiss, do.return = TRUE, group.by = "channel")
TSNEPlot(object = tiss, do.return = TRUE, group.by = "mouse.sex")
Print a table showing the count of cells in each identity category from each plate.
table(as.character(tiss@ident), as.character(tiss@meta.data$channel))
10X_P4_2 10X_P7_0 10X_P7_1
0 0 198 60
1 193 16 5
2 2 114 87
3 146 19 1
4 114 1 0
5 0 29 21
6 5 10 5
[All 10x data are hepatocytes, subcuster into CV/PV/midlobular done above, no further subclustering needed.]
We can repeat the above analysis on a subset of cells, defined using cluster IDs or some other metadata. This is a good way to drill down and find substructure.
# Subset data based on cluster id
### subtiss <- SubsetData(object = tiss, ident.use = c(), do.center = F, do.scale = F, cells.use = )
# To subset data based on annotation or other metadata, you can explicitly pass cell names
cells.to.use = tiss@cell.names[which(tiss@meta.data$mouse.sex == 'F')]
### subtiss <- SubsetData(object = tiss, cells.use = cells.to.use, do.center = F, do.scale = F)
subtiss <- NormalizeData(object = subtiss)
subtiss <- ScaleData(object = subtiss, vars.to.regress = c("nUMI", "percent.ribo","Rn45s"))
[1] "Regressing out nUMI" "Regressing out percent.ribo"
[3] "Regressing out Rn45s"
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 14%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 29%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 46%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 54%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================== | 70%
|
|============================================== | 71%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 86%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 94%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
[1] "Scaling data matrix"
|
| | 0%
|
|=================================================================| 100%
Run Principal Component Analysis.
subtiss <- FindVariableGenes(object = subtiss, do.plot = TRUE, x.high.cutoff = Inf, y.cutoff = 0.8)
subtiss <- RunPCA(object = subtiss, pcs.compute = 20, weight.by.var = F)
[1] "PC1"
[1] "Mrc1" "Cldn5" "Gatm" "Dnase1l3" "Igfbp7" "Elk3"
[7] "Egfl7" "Oit3" "Clec4g" "Bmp2" "Gpr116" "Pde2a"
[13] "Eng" "Klf2" "Aqp1" "Ptprb" "Tinagl1" "Fcgr2b"
[19] "Cdh5" "Fam167b" "Stab2" "Marcksl1" "F8" "Ramp2"
[25] "Ppap2a" "Cd55" "Eltd1" "Gpihbp1" "Gimap6" "Esam"
[1] ""
[1] "Fabp1" "Mup3" "Bhmt" "Hrsp12" "Angptl3" "Acaa1b" "Arg1"
[8] "Hamp" "Apoc2" "Ass1" "Mif" "Mup2" "Gsta4" "Rgn"
[15] "Krt18" "Uqcrb" "Cox6b1" "Cyp2c29" "S100a6" "Gstp1" "Pck1"
[22] "Hamp2" "Scd1" "Sult1a1" "Ly6d" "A1bg" "Thrsp" "Sprr1a"
[29] "G0s2" "Cyp2c40"
[1] ""
[1] ""
[1] "PC2"
[1] "Ctss" "Vsig4" "C1qc" "C1qa" "Folr2" "Cd5l"
[7] "Clec4f" "Rgs2" "Gm11428" "Itgal" "C1qb" "Tyrobp"
[13] "Fcna" "Lyz2" "Csf1r" "Fabp7" "Lyz1" "Cybb"
[19] "Cfp" "H2-Aa" "Cd74" "Ccl6" "Laptm5" "Mpeg1"
[25] "Sdc3" "Ifi27l2a" "H2-Ab1" "Fcer1g" "Aif1" "H2-Eb1"
[1] ""
[1] "Clec4g" "Ptprb" "Tinagl1" "Dnase1l3" "Egfl7" "Eng"
[7] "Pde2a" "Fam167b" "Fcgr2b" "Gpr182" "Igfbp7" "Cd300lg"
[13] "Ppap2a" "Gpihbp1" "Eltd1" "Ramp2" "Gimap6" "Tspan7"
[19] "Adam23" "Oit3" "Cldn5" "F8" "Cyp4b1" "Esam"
[25] "Aqp1" "Bmp2" "Kdr" "Gpr116" "Hspg2" "Slc43a3"
[1] ""
[1] ""
[1] "PC3"
[1] "Gstp1" "Hrsp12" "Cyp2f2" "Mup20" "Sds"
[6] "Hsd17b13" "Mup3" "Serpina12" "Apoc2" "Selenbp2"
[11] "Hsd17b6" "Ubb" "Mup21" "Ass1" "Cox6b1"
[16] "Mup12" "Gls2" "Hal" "Ifitm2" "Mup1"
[21] "Uqcrb" "Arg1" "Gsta4" "Rpl17" "Agxt2l1"
[26] "Foxq1" "Rps6" "Mif" "Gm1821" "Tstd1"
[1] ""
[1] "Slco1b2" "Cyp2c37" "Cyp2e1" "Gulo" "Cyp2c50" "Cyp4a14" "Cyp1a2"
[8] "Cyp2c29" "Cyp2c69" "Cyp3a11" "Ces1c" "Oat" "Rnase4" "Aldh3a2"
[15] "Rgn" "Cyp7a1" "Aldh1a1" "Lect2" "Sult3a1" "Xist" "Ces3a"
[22] "Mug1" "Cyp2c40" "Cyp2a5" "Pzp" "Cyp2b13" "Slc22a1" "Egr1"
[29] "C3" "Cyp3a44"
[1] ""
[1] ""
[1] "PC4"
[1] "A1bg" "Fmo3" "Cyp2a4" "Cyp2b9" "Hal" "Cyp2c69"
[7] "Uroc1" "Gls2" "Xist" "Apoa4" "Pck1" "Acly"
[13] "Cyp2c40" "Ctsc" "Hamp2" "Got1" "Sult2a1" "Hsd17b6"
[19] "Cyp2f2" "Malat1" "Aldh1b1" "Arg1" "Ass1" "Agxt2l1"
[25] "Sds" "Mt1" "Hsd17b13" "Cxcl12" "Sult2a2" "Cdh1"
[1] ""
[1] "Cyp2d9" "Elovl3" "Mup1" "Mup2"
[5] "Mup20" "Hsd3b5" "Selenbp2" "Cyp4a12a"
[9] "Nudt7" "Gstp1" "Cyp7b1" "Slco1a1"
[13] "Cyp2e1" "Fitm1" "Fabp1" "Acaa1b"
[17] "Mup21" "Mup12" "C6" "Cyp2c67"
[21] "Blvrb" "Ttc39c" "Oat" "Slc22a1"
[25] "Alas1" "Serpina4-ps1" "Cyp1a2" "Gulo"
[29] "Mup3" "Ugt2b1"
[1] ""
[1] ""
[1] "PC5"
[1] "Ubb" "Uqcrb" "Hamp" "Angptl3"
[5] "Gm1821" "Gm6222" "Fabp1" "Rgn"
[9] "Sult1a1" "Mif" "Cyp2c69" "Cox6b1"
[13] "Bhmt" "Hpgd" "Lect2" "Sult2a2"
[17] "Sult3a1" "Blvrb" "Rpl23a" "Cyp2c40"
[21] "Sult2a5" "AW112010" "Cyp2c50" "A130040M12Rik"
[25] "Cyp2c37" "Ang" "Rps13" "Ifitm2"
[29] "Hamp2" "Arg1"
[1] ""
[1] "Lpin1" "C3" "Cyp2f2" "Serpina4-ps1"
[5] "Hc" "Scd1" "Ces3a" "Plg"
[9] "Ces3b" "Slc7a2" "Foxq1" "Hsd17b13"
[13] "C4b" "Clu" "Cdh1" "Mug1"
[17] "Mup20" "Malat1" "Cyp2d9" "Slc38a2"
[21] "Aspg" "Ttc39c" "Egr1" "Ugt2b5"
[25] "Cyp4a12a" "Hal" "Pck1" "S100a6"
[29] "Retsat" "Aox3"
[1] ""
[1] ""
subtiss <- ProjectPCA(object = subtiss, do.print = FALSE)
# If this fails for your subset, it may be that cells.use is more cells than you have left! Try reducing it.
PCHeatmap(object = subtiss, pc.use = 1:3, cells.use = 250, do.balanced = TRUE, label.columns = FALSE, num.genes = 12)
Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, supressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
PCElbowPlot(object = subtiss)
Choose the number of principal components to use.
# Set number of principal components.
sub.n.pcs = 10
The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity–more connections within the group than between groups. The resolution parameter determines the scale…higher resolution will give more clusters, lower resolution will give fewer.
# Set resolution
sub.res.used <- 0.5
subtiss <- FindClusters(object = subtiss, reduction.type = "pca", dims.use = 1:sub.n.pcs,
resolution = sub.res.used, ,print.output = 0, save.SNN = TRUE, force.recalc = TRUE)
Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
= reduction.type, : Build parameters exactly match those of already
computed and stored SNN. To force recalculation, set force.recalc to TRUE.
PrintFindClustersParams(object = subtiss)
Parameters used in latest FindClusters calculation run on: 2018-01-22 10:48:02
=============================================================================
Resolution: 0.5
-----------------------------------------------------------------------------
Modularity Function Algorithm n.start n.iter
1 1 100 10
-----------------------------------------------------------------------------
Reduction used k.param k.scale prune.SNN
pca 30 25 0.0667
-----------------------------------------------------------------------------
Dims used in calculation
=============================================================================
1 2 3 4 5 6 7 8 9 10
To visualize
# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
subtiss <- RunTSNE(object = subtiss, dims.use = 1:sub.n.pcs, seed.use = 10, perplexity=20)
# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
#zonated = c('Alb', 'Ass1','Asl', 'Cyp2f2', 'Cyp2e1', 'Glul', 'Clec4g', 'C1qa')
#more_zonated = c('Glul', 'Cyp2e1','Cyp2f2', 'Alb', 'Axin2','Oat','Ass1','Asl')
#subtiss <- RunTSNE(object = subtiss, seed.use = 10, perplexity=20, genes.use = zonated)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = subtiss, do.label = T)
subtiss.markers <- FindAllMarkers(object = subtiss, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
subtiss.markers %>% group_by(cluster) %>% top_n(6, avg_diff)
Check expression of genes of interset.
genes_to_check = c('Alb', 'Cyp2f2', 'Cyp2e1', 'Hamp', 'Glul', 'Ass1', 'Slc1a2', 'Hal', 'Igfbp2', 'Pecam1', 'C1qa')
FeaturePlot(subtiss, genes_to_check, pt.size = 1)
Dotplots let you see the intensity of exppression and the fraction of cells expressing for each of your genes of interest.
# To change the y-axis to show raw counts, add use.raw = T.
DotPlot(subtiss, genes_to_check, plot.legend = T)
How big are the clusters?
table(subtiss@ident)
0 1 2 3 4 5 6
258 214 203 166 115 50 20
Color by metadata, like plate barcode, to check for batch effects.
TSNEPlot(object = subtiss, do.return = TRUE, group.by = "channel")
Print a table showing the count of cells in each identity category from each plate.
table(as.character(subtiss@ident), as.character(subtiss@meta.data$channel))
10X_P4_2 10X_P7_0 10X_P7_1
0 17 165 76
1 175 29 10
2 15 122 66
3 133 27 6
4 110 5 0
5 3 31 16
6 7 8 5
When you save the annotated tissue, please give it a name.
filename = here('00_data_ingest', '04_tissue_robj_generated',
paste0(tissue_of_interest, "_droplet_seurat_tiss.Robj"))
print(filename)
[1] "/Users/olgabot/code/tabula-muris/00_data_ingest/04_tissue_robj_generated/Liver_droplet_seurat_tiss.Robj"
save(tiss, file=filename)
# To reload a saved object
# filename = here('00_data_ingest', '04_tissue_robj_generated',
# paste0(tissue_of_interest, "_seurat_tiss.Robj"))
# load(file=filename)
So that Biohub can easily combine all your annotations, please export them as a simple csv.
head(tiss@meta.data)
filename = here('00_data_ingest', '03_tissue_annotation_csv',
paste0(tissue_of_interest, "_droplet_annotation.csv"))
write.csv(tiss@meta.data[,c('channel','annotation','cell_ontology_id')], file=filename)